{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Devices\n",
      "Loading Emails\n",
      "Loading Files\n",
      "Loading Logons\n",
      "Loading Http\n",
      "Finished Loading\n"
     ]
    }
   ],
   "source": [
    "path_to_dataset = \"C:/Users/tabzr/Documents/CMU Dataset/r4.2/\"\n",
    "# path_to_dataset = \"~/Documents/CMU_Dataset/r4.2/\"\n",
    "\n",
    "print(\"Loading Devices\", flush=True)\n",
    "devices = pd.read_csv(path_to_dataset + \"device.csv\", index_col=2)\n",
    "print(\"Loading Emails\", flush=True)\n",
    "emails = pd.read_csv(path_to_dataset + \"email.csv\", index_col=2)\n",
    "print(\"Loading Files\", flush=True)\n",
    "files = pd.read_csv(path_to_dataset + \"file.csv\", index_col=2)\n",
    "print(\"Loading Logons\", flush=True)\n",
    "logons = pd.read_csv(path_to_dataset + \"logon.csv\", index_col=2)\n",
    "print(\"Loading Http\", flush=True)\n",
    "fields = [\"date\", \"user\", \"url\"]\n",
    "https = pd.read_csv(path_to_dataset + \"http.csv\", usecols=fields, index_col=1)\n",
    "print(\"Finished Loading\", flush=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converting date strings to Timestamps\n",
      "Finished devices\n",
      "Finished emails\n",
      "Finished files\n",
      "Finished logons\n",
      "Finished https\n",
      "Finished converting dates\n"
     ]
    }
   ],
   "source": [
    "# All dates follow this format\n",
    "dateFormat = \"%m/%d/%Y %H:%M:%S\"\n",
    "\n",
    "print(\"Converting date strings to Timestamps\", flush=True)\n",
    "\n",
    "devices[\"date\"] = pd.to_datetime( devices[\"date\"], format=dateFormat)\n",
    "print(\"Finished devices\", flush=True)\n",
    "\n",
    "emails[\"date\"] = pd.to_datetime( emails[\"date\"], format=dateFormat)\n",
    "print(\"Finished emails\", flush=True)\n",
    "\n",
    "files[\"date\"] = pd.to_datetime( files[\"date\"], format=dateFormat)\n",
    "print(\"Finished files\", flush=True)\n",
    "\n",
    "logons[\"date\"] = pd.to_datetime( logons[\"date\"], format=dateFormat)\n",
    "print(\"Finished logons\", flush=True)\n",
    "\n",
    "https[\"date\"] = pd.to_datetime( https[\"date\"], format=dateFormat)\n",
    "print(\"Finished https\", flush=True)\n",
    "\n",
    "print(\"Finished converting dates\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features = 0\n",
    "feature_map = {}\n",
    "def addFeature(name):\n",
    "    if name not in feature_map:\n",
    "        global features\n",
    "        feature_map[name] = features\n",
    "        features+=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Logon features\n",
    "# addFeature(\"Weekday_Logon_Normal\") # 9am-5pm\n",
    "# addFeature(\"Weekday_Logon_After\")\n",
    "# addFeature(\"Weekend_Logon\")\n",
    "addFeature(\"Logon\")\n",
    "# addFeature(\"Logoff_03hr\")\n",
    "# addFeature(\"Logoff_36hr\")\n",
    "# addFeature(\"Logoff_69hr\")\n",
    "# addFeature(\"Logoff_9+hr\")\n",
    "addFeature(\"Logoff\") # Calculating length between first logon and logoff is a little tricky \n",
    "\n",
    "def logonFeatures(row):\n",
    "    if row[\"activity\"] == \"Logon\":\n",
    "        if row[\"date\"].weekday() < 5:\n",
    "            if row[\"date\"].hour >= 8 and row[\"date\"].hour < 17:\n",
    "                return feature_map[\"Weekday_Logon_Normal\"]\n",
    "            else:\n",
    "                return feature_map[\"Weekday_Logon_After\"]\n",
    "        else:\n",
    "            return feature_map[\"Weekend_Logon\"]\n",
    "    else: #Is Logoff\n",
    "        return feature_map[\"Logoff\"]\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Device features\n",
    "addFeature(\"Connect\")\n",
    "# addFeature(\"Connect_Normal\")\n",
    "# addFeature(\"Connect_After\")\n",
    "# addFeature(\"Connect_Weekend\")\n",
    "addFeature(\"Disconnect\")\n",
    "\n",
    "def deviceFeatures(row):\n",
    "    if row[\"activity\"] == \"Connect\":\n",
    "        if row[\"date\"].weekday() < 5:\n",
    "            if row[\"date\"].hour >= 8 and row[\"date\"].hour < 17:\n",
    "                return feature_map[\"Connect_Normal\"]\n",
    "            else:\n",
    "                return feature_map[\"Connect_After\"]\n",
    "        else:\n",
    "            return feature_map[\"Connect_Weekend\"]\n",
    "    else:\n",
    "        return feature_map[\"Disconnect\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The http web log is too large to extract features from (takes too long)\n",
    "addFeature(\"Website\")\n",
    "\n",
    "def httpFeatures(row):\n",
    "    return feature_map[\"Website\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "addFeature(\"Email\")\n",
    "# addFeature(\"Email_In\") # All recepients are company email addresses\n",
    "# addFeature(\"Email_Out\") # Sent to a non-company email address\n",
    "\n",
    "def emailFeatures(row):\n",
    "    outsider = False\n",
    "    if not pd.isnull(row[\"to\"]):\n",
    "        for address in row[\"to\"].split(\";\"):\n",
    "            if address.endswith(\"dtaa.com\"):\n",
    "                outsider = True\n",
    "                \n",
    "    if not pd.isnull(row[\"cc\"]):\n",
    "        for address in row[\"cc\"].split(\";\"):\n",
    "            if address.endswith(\"dtaa.com\"):\n",
    "                outsider = True\n",
    "                \n",
    "    if not pd.isnull(row[\"bcc\"]):\n",
    "        for address in row[\"bcc\"].split(\";\"):\n",
    "            if address.endswith(\"dtaa.com\"):\n",
    "                outsider = True\n",
    "    if outsider:\n",
    "        return feature_map[\"Email_Out\"]\n",
    "    else:\n",
    "        return feature_map[\"Email_In\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# filenames = np.unique( files[\"filename\"].apply(lambda x: x.split(\".\",1)[1]) )\n",
    "# filenames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "addFeature(\"File\")\n",
    "# addFeature(\"File_exe\";)\n",
    "# addFeature(\"File_jpg\")\n",
    "# addFeature(\"File_zip\")\n",
    "# addFeature(\"File_txt\")\n",
    "# addFeature(\"File_doc\") #Or pdf\n",
    "\n",
    "def fileFeatures(row):\n",
    "    if row[\"filename\"].endswith(\".exe\"):\n",
    "        return feature_map[\"File_exe\"]\n",
    "    if row[\"filename\"].endswith(\".jpg\"):\n",
    "        return feature_map[\"File_jpg\"]\n",
    "    if row[\"filename\"].endswith(\".zip\"):\n",
    "        return feature_map[\"File_zip\"]\n",
    "    if row[\"filename\"].endswith(\".txt\"):\n",
    "        return feature_map[\"File_txt\"]\n",
    "    else:\n",
    "        return feature_map[\"File_doc\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Connect': 2,\n",
       " 'Disconnect': 3,\n",
       " 'Email': 5,\n",
       " 'File': 6,\n",
       " 'Logoff': 1,\n",
       " 'Logon': 0,\n",
       " 'Website': 4}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logon\n",
      "Device\n",
      "Email\n",
      "File\n"
     ]
    }
   ],
   "source": [
    "# Feature extraction\n",
    "\n",
    "cols_to_keep = [\"date\", \"feature\"]\n",
    "\n",
    "# Logons\n",
    "print(\"Logon\", flush=True)\n",
    "logons[\"feature\"] = logons.apply(logonFeatures, axis=1)\n",
    "logons = logons[cols_to_keep]\n",
    "# Devices\n",
    "print(\"Device\", flush=True)\n",
    "devices[\"feature\"] = devices.apply(deviceFeatures, axis=1)\n",
    "devices = devices[cols_to_keep]\n",
    "# Email\n",
    "print(\"Email\", flush=True)\n",
    "emails[\"feature\"] = emails.apply(emailFeatures, axis=1)\n",
    "emails = emails[cols_to_keep]\n",
    "# Files\n",
    "print(\"File\", flush=True)\n",
    "files[\"feature\"] = files.apply(fileFeatures, axis=1)\n",
    "files = files[cols_to_keep]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Http\n"
     ]
    }
   ],
   "source": [
    "# Http\n",
    "print(\"Http\", flush=True)\n",
    "# https[\"feature\"] = https.apply(httpFeatures, axis=1)\n",
    "https[\"feature\"] = feature_map[\"Website\"]\n",
    "https = https[cols_to_keep]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# cols_to_keep = [\"date\", \"feature\"]\n",
    "# logons = logons[cols_to_keep]\n",
    "# devices = devices[cols_to_keep]\n",
    "# https = https[cols_to_keep]\n",
    "# emails = emails[cols_to_keep]\n",
    "# files = files[cols_to_keep]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "joint = pd.concat([logons, devices, https, emails, files])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Consider dropping stuff earlier since this is very memory intensive\n",
    "# joint.drop([\"activity\", \"attachments\", \"bcc\", \"cc\", \"content\", \"filename\", \"from\", \"id\", \"pc\", \"size\", \"to\", \"user\"], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "joint.sort_values(\"date\", axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Write to disk as hdf because feature extraction takes a looooooong time\n",
    "joint.to_hdf(\"C:/Users/tabzr/Documents/CMU Dataset/r4.2/r42_features_complex.h5\", \"table\", mode=\"w\")\n",
    "# joint.to_hdf(\"/home/tabz/Documents/CMU_Dataset/r4.2/features_simple.h5\", \"table\", mode=\"w\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
